Machine Learning Project¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import category_encoders as ce
import plotly.subplots as sp
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import time
from sklearn.linear_model import SGDRegressor
import joblib

Styling¶

In [ ]:
def styled_describe(df):
    styled = df.describe().style.set_table_styles([
        {'selector': 'th', 'props': [('background', '#606060'), 
                                    ('color', 'white'), 
                                    ('font-family', 'verdana')]},
        {'selector': 'td', 'props': [('font-family', 'verdana')]},
    ])
    return styled


layout_options = {
    'paper_bgcolor':"#383838",
    'plot_bgcolor':'#383838',
    'title_font': dict(color='white'),
    'legend_font': dict(color='white'),
    'yaxis':dict(color="white"),
    'xaxis':dict(color="white")
    }

1. Data Preprocessing and Exploration¶

1.1 Data Loading and Exploring¶

In [ ]:
df = pd.read_csv("nyc-rolling-sales.csv")
df.head()
Out[ ]:
Unnamed: 0 BOROUGH NEIGHBORHOOD BUILDING CLASS CATEGORY TAX CLASS AT PRESENT BLOCK LOT EASE-MENT BUILDING CLASS AT PRESENT ADDRESS ... RESIDENTIAL UNITS COMMERCIAL UNITS TOTAL UNITS LAND SQUARE FEET GROSS SQUARE FEET YEAR BUILT TAX CLASS AT TIME OF SALE BUILDING CLASS AT TIME OF SALE SALE PRICE SALE DATE
0 4 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2A 392 6 C2 153 AVENUE B ... 5 0 5 1633 6440 1900 2 C2 6625000 2017-07-19 00:00:00
1 5 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2 399 26 C7 234 EAST 4TH STREET ... 28 3 31 4616 18690 1900 2 C7 - 2016-12-14 00:00:00
2 6 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2 399 39 C7 197 EAST 3RD STREET ... 16 1 17 2212 7803 1900 2 C7 - 2016-12-09 00:00:00
3 7 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2B 402 21 C4 154 EAST 7TH STREET ... 10 0 10 2272 6794 1913 2 C4 3936272 2016-09-23 00:00:00
4 8 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2A 404 55 C2 301 EAST 10TH STREET ... 6 0 6 2369 4615 1900 2 C2 8000000 2016-11-17 00:00:00

5 rows × 22 columns

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84548 entries, 0 to 84547
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Unnamed: 0                      84548 non-null  int64 
 1   BOROUGH                         84548 non-null  int64 
 2   NEIGHBORHOOD                    84548 non-null  object
 3   BUILDING CLASS CATEGORY         84548 non-null  object
 4   TAX CLASS AT PRESENT            84548 non-null  object
 5   BLOCK                           84548 non-null  int64 
 6   LOT                             84548 non-null  int64 
 7   EASE-MENT                       84548 non-null  object
 8   BUILDING CLASS AT PRESENT       84548 non-null  object
 9   ADDRESS                         84548 non-null  object
 10  APARTMENT NUMBER                84548 non-null  object
 11  ZIP CODE                        84548 non-null  int64 
 12  RESIDENTIAL UNITS               84548 non-null  int64 
 13  COMMERCIAL UNITS                84548 non-null  int64 
 14  TOTAL UNITS                     84548 non-null  int64 
 15  LAND SQUARE FEET                84548 non-null  object
 16  GROSS SQUARE FEET               84548 non-null  object
 17  YEAR BUILT                      84548 non-null  int64 
 18  TAX CLASS AT TIME OF SALE       84548 non-null  int64 
 19  BUILDING CLASS AT TIME OF SALE  84548 non-null  object
 20  SALE PRICE                      84548 non-null  object
 21  SALE DATE                       84548 non-null  object
dtypes: int64(10), object(12)
memory usage: 14.2+ MB
In [ ]:
test_data_after_save_model = df.iloc[-5:]  
df = df.iloc[:-5]  # All except last 5 rows
test_data_after_save_model.head()
Out[ ]:
Unnamed: 0 BOROUGH NEIGHBORHOOD BUILDING CLASS CATEGORY TAX CLASS AT PRESENT BLOCK LOT EASE-MENT BUILDING CLASS AT PRESENT ADDRESS ... RESIDENTIAL UNITS COMMERCIAL UNITS TOTAL UNITS LAND SQUARE FEET GROSS SQUARE FEET YEAR BUILT TAX CLASS AT TIME OF SALE BUILDING CLASS AT TIME OF SALE SALE PRICE SALE DATE
84543 8409 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7349 34 B9 37 QUAIL LANE ... 2 0 2 2400 2575 1998 1 B9 450000 2016-11-28 00:00:00
84544 8410 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7349 78 B9 32 PHEASANT LANE ... 2 0 2 2498 2377 1998 1 B9 550000 2017-04-21 00:00:00
84545 8411 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7351 60 B2 49 PITNEY AVENUE ... 2 0 2 4000 1496 1925 1 B2 460000 2017-07-05 00:00:00
84546 8412 5 WOODROW 22 STORE BUILDINGS 4 7100 28 K6 2730 ARTHUR KILL ROAD ... 0 7 7 208033 64117 2001 4 K6 11693337 2016-12-21 00:00:00
84547 8413 5 WOODROW 35 INDOOR PUBLIC AND CULTURAL FACILITIES 4 7105 679 P9 155 CLAY PIT ROAD ... 0 1 1 10796 2400 2006 4 P9 69300 2016-10-27 00:00:00

5 rows × 22 columns

In [ ]:
df.tail()
Out[ ]:
Unnamed: 0 BOROUGH NEIGHBORHOOD BUILDING CLASS CATEGORY TAX CLASS AT PRESENT BLOCK LOT EASE-MENT BUILDING CLASS AT PRESENT ADDRESS ... RESIDENTIAL UNITS COMMERCIAL UNITS TOTAL UNITS LAND SQUARE FEET GROSS SQUARE FEET YEAR BUILT TAX CLASS AT TIME OF SALE BUILDING CLASS AT TIME OF SALE SALE PRICE SALE DATE
84538 8404 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7316 61 B2 178 DARNELL LANE ... 2 0 2 3215 1300 1995 1 B2 - 2017-06-30 00:00:00
84539 8405 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7316 85 B2 137 DARNELL LANE ... 2 0 2 3016 1300 1995 1 B2 - 2016-12-30 00:00:00
84540 8406 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7316 93 B2 125 DARNELL LANE ... 2 0 2 3325 1300 1995 1 B2 509000 2016-10-31 00:00:00
84541 8407 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7317 126 B2 112 ROBIN COURT ... 2 0 2 11088 2160 1994 1 B2 648000 2016-12-07 00:00:00
84542 8408 5 WOODROW 02 TWO FAMILY DWELLINGS 1 7339 41 B9 41 SONIA COURT ... 2 0 2 3020 1800 1997 1 B9 - 2016-12-01 00:00:00

5 rows × 22 columns

1.2 Data Cleaning¶

drop unimportant column Unnamed: 0

In [ ]:
df.drop('Unnamed: 0',axis=1,inplace=True)

drop duplicates

In [ ]:
df.duplicated().sum()
df = df.drop_duplicates()
In [ ]:
df.head()
Out[ ]:
BOROUGH NEIGHBORHOOD BUILDING CLASS CATEGORY TAX CLASS AT PRESENT BLOCK LOT EASE-MENT BUILDING CLASS AT PRESENT ADDRESS APARTMENT NUMBER ... RESIDENTIAL UNITS COMMERCIAL UNITS TOTAL UNITS LAND SQUARE FEET GROSS SQUARE FEET YEAR BUILT TAX CLASS AT TIME OF SALE BUILDING CLASS AT TIME OF SALE SALE PRICE SALE DATE
0 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2A 392 6 C2 153 AVENUE B ... 5 0 5 1633 6440 1900 2 C2 6625000 2017-07-19 00:00:00
1 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2 399 26 C7 234 EAST 4TH STREET ... 28 3 31 4616 18690 1900 2 C7 - 2016-12-14 00:00:00
2 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2 399 39 C7 197 EAST 3RD STREET ... 16 1 17 2212 7803 1900 2 C7 - 2016-12-09 00:00:00
3 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2B 402 21 C4 154 EAST 7TH STREET ... 10 0 10 2272 6794 1913 2 C4 3936272 2016-09-23 00:00:00
4 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2A 404 55 C2 301 EAST 10TH STREET ... 6 0 6 2369 4615 1900 2 C2 8000000 2016-11-17 00:00:00

5 rows × 21 columns

Handle Empty and Whitspaces values¶

In [ ]:
# Identify object columns
object_columns = df.select_dtypes(include=['object']).columns

# Function to replace whitespace and empty values with NaN
def replace_whitespace_and_empty(val):
    if isinstance(val, str) and (val.strip() == '' or val == ''):
        return np.nan
    return val

# Apply the function to each object column
for col in object_columns:
    df[col] = df[col].apply(replace_whitespace_and_empty)

df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 83778 entries, 0 to 84542
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BOROUGH                         83778 non-null  int64  
 1   NEIGHBORHOOD                    83778 non-null  object 
 2   BUILDING CLASS CATEGORY         83778 non-null  object 
 3   TAX CLASS AT PRESENT            83040 non-null  object 
 4   BLOCK                           83778 non-null  int64  
 5   LOT                             83778 non-null  int64  
 6   EASE-MENT                       0 non-null      float64
 7   BUILDING CLASS AT PRESENT       83040 non-null  object 
 8   ADDRESS                         83778 non-null  object 
 9   APARTMENT NUMBER                18849 non-null  object 
 10  ZIP CODE                        83778 non-null  int64  
 11  RESIDENTIAL UNITS               83778 non-null  int64  
 12  COMMERCIAL UNITS                83778 non-null  int64  
 13  TOTAL UNITS                     83778 non-null  int64  
 14  LAND SQUARE FEET                83778 non-null  object 
 15  GROSS SQUARE FEET               83778 non-null  object 
 16  YEAR BUILT                      83778 non-null  int64  
 17  TAX CLASS AT TIME OF SALE       83778 non-null  int64  
 18  BUILDING CLASS AT TIME OF SALE  83778 non-null  object 
 19  SALE PRICE                      83778 non-null  object 
 20  SALE DATE                       83778 non-null  object 
dtypes: float64(1), int64(9), object(11)
memory usage: 14.1+ MB

Let’s create a hybrid approach for the “SALE DATE” feature. We’ll extract both numeric and categorical components from the date.¶

In [ ]:
# Convert 'Sale Date' to datetime format
df['SALE DATE'] = pd.to_datetime(df['SALE DATE'])

# Extract numeric features
df['DayOfMonth'] = df['SALE DATE'].dt.day
df['Month'] = df['SALE DATE'].dt.month
df['Year'] = df['SALE DATE'].dt.year

# Extract categorical features
df['DayOfWeek'] = df['SALE DATE'].dt.day_name()
df['Season'] = df['SALE DATE'].dt.month.apply(lambda x: 'Spring' if 3 <= x <= 5
                                              else ('Summer' if 6 <= x <= 8
                                                    else ('Fall' if 9 <= x <= 11
                                                          else 'Winter')))
df.drop('SALE DATE',axis=1,inplace=True)
df.head()
Out[ ]:
BOROUGH NEIGHBORHOOD BUILDING CLASS CATEGORY TAX CLASS AT PRESENT BLOCK LOT EASE-MENT BUILDING CLASS AT PRESENT ADDRESS APARTMENT NUMBER ... GROSS SQUARE FEET YEAR BUILT TAX CLASS AT TIME OF SALE BUILDING CLASS AT TIME OF SALE SALE PRICE DayOfMonth Month Year DayOfWeek Season
0 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2A 392 6 NaN C2 153 AVENUE B NaN ... 6440 1900 2 C2 6625000 19 7 2017 Wednesday Summer
1 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2 399 26 NaN C7 234 EAST 4TH STREET NaN ... 18690 1900 2 C7 - 14 12 2016 Wednesday Winter
2 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2 399 39 NaN C7 197 EAST 3RD STREET NaN ... 7803 1900 2 C7 - 9 12 2016 Friday Winter
3 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2B 402 21 NaN C4 154 EAST 7TH STREET NaN ... 6794 1913 2 C4 3936272 23 9 2016 Friday Fall
4 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2A 404 55 NaN C2 301 EAST 10TH STREET NaN ... 4615 1900 2 C2 8000000 17 11 2016 Thursday Fall

5 rows × 25 columns

Now we have additional features:

  • DayOfMonth (numeric)
  • Month (numeric)
  • Year (numeric)
  • DayOfWeek (categorical)
  • Season (categorical)

Separate features¶

Some numerical features like SALE PRICE considered as object type but it must be integer or float so we had to separate the features manually¶

In [ ]:
numeric_features_columns = [
    'EASE-MENT',
    'APARTMENT NUMBER',
    'BLOCK',
    'LOT',
    'ZIP CODE',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET', 
    'GROSS SQUARE FEET',
    'SALE PRICE',
]

categorical_features_columns = [
    'BOROUGH',
    'NEIGHBORHOOD',
    'BUILDING CLASS CATEGORY', 
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT PRESENT',
    'ADDRESS',
    'BUILDING CLASS AT TIME OF SALE',
    'TAX CLASS AT TIME OF SALE',
    'DayOfMonth',
    'Month',
    'Year',
    'DayOfWeek',
    'Season',
    'YEAR BUILT',
]
In [ ]:
# Convert non-numeric columns to numeric and replace non-numeric values with NaN
for col in numeric_features_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
    
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 83778 entries, 0 to 84542
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BOROUGH                         83778 non-null  int64  
 1   NEIGHBORHOOD                    83778 non-null  object 
 2   BUILDING CLASS CATEGORY         83778 non-null  object 
 3   TAX CLASS AT PRESENT            83040 non-null  object 
 4   BLOCK                           83778 non-null  int16  
 5   LOT                             83778 non-null  int16  
 6   EASE-MENT                       0 non-null      float64
 7   BUILDING CLASS AT PRESENT       83040 non-null  object 
 8   ADDRESS                         83778 non-null  object 
 9   APARTMENT NUMBER                3802 non-null   float64
 10  ZIP CODE                        83778 non-null  int16  
 11  RESIDENTIAL UNITS               83778 non-null  int16  
 12  COMMERCIAL UNITS                83778 non-null  int16  
 13  TOTAL UNITS                     83778 non-null  int16  
 14  LAND SQUARE FEET                57724 non-null  float64
 15  GROSS SQUARE FEET               56393 non-null  float64
 16  YEAR BUILT                      83778 non-null  int64  
 17  TAX CLASS AT TIME OF SALE       83778 non-null  int64  
 18  BUILDING CLASS AT TIME OF SALE  83778 non-null  object 
 19  SALE PRICE                      69602 non-null  float64
 20  DayOfMonth                      83778 non-null  int32  
 21  Month                           83778 non-null  int32  
 22  Year                            83778 non-null  int32  
 23  DayOfWeek                       83778 non-null  object 
 24  Season                          83778 non-null  object 
dtypes: float64(5), int16(6), int32(3), int64(3), object(8)
memory usage: 12.8+ MB

Colmn EASE-MENT is all empty and column APARTMENT NUMBER is almost empty so we will drop them¶

In [ ]:
numeric_features_columns = [
    'BLOCK',
    'LOT',
    'ZIP CODE',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET', 
    'GROSS SQUARE FEET',
    'SALE PRICE',
]
df.drop(['APARTMENT NUMBER', 'EASE-MENT'], axis=1, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 83778 entries, 0 to 84542
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BOROUGH                         83778 non-null  int64  
 1   NEIGHBORHOOD                    83778 non-null  object 
 2   BUILDING CLASS CATEGORY         83778 non-null  object 
 3   TAX CLASS AT PRESENT            83040 non-null  object 
 4   BLOCK                           83778 non-null  int16  
 5   LOT                             83778 non-null  int16  
 6   BUILDING CLASS AT PRESENT       83040 non-null  object 
 7   ADDRESS                         83778 non-null  object 
 8   ZIP CODE                        83778 non-null  int16  
 9   RESIDENTIAL UNITS               83778 non-null  int16  
 10  COMMERCIAL UNITS                83778 non-null  int16  
 11  TOTAL UNITS                     83778 non-null  int16  
 12  LAND SQUARE FEET                57724 non-null  float64
 13  GROSS SQUARE FEET               56393 non-null  float64
 14  YEAR BUILT                      83778 non-null  int64  
 15  TAX CLASS AT TIME OF SALE       83778 non-null  int64  
 16  BUILDING CLASS AT TIME OF SALE  83778 non-null  object 
 17  SALE PRICE                      69602 non-null  float64
 18  DayOfMonth                      83778 non-null  int32  
 19  Month                           83778 non-null  int32  
 20  Year                            83778 non-null  int32  
 21  DayOfWeek                       83778 non-null  object 
 22  Season                          83778 non-null  object 
dtypes: float64(3), int16(6), int32(3), int64(3), object(8)
memory usage: 11.5+ MB

Tax Class at Present:

Every property in the city is assigned to one of four tax classes (Classes 1, 2, 3, and 4), based on the use of the property.

• Class 1: Includes most residential property of up to three units (such as one-, two-, and three-family homes and small stores or offices with one or two attached apartments), vacant land that is zoned for residential use, and most condominiums that are not more than three stories.

• Class 2: Includes all other property that is primarily residential, such as cooperatives and condominiums.

• Class 3: Includes property with equipment owned by a gas, telephone or electric company.

• Class 4: Includes all other properties not included in class 1,2, and 3, such as offices, factories, warehouses, garage buildings, etc.

so we will filter any value that is not in that one of that four values

In [ ]:
# First convert the column to numeric and replace non-numeric values with NaN
df['TAX CLASS AT PRESENT'] = pd.to_numeric(df['TAX CLASS AT PRESENT'], errors='coerce', downcast='integer')

# Valid tax classes
valid_tax_classes = {1, 2, 3, 4}

# Filter the values based on valid tax classes
df['TAX CLASS AT PRESENT'] = df['TAX CLASS AT PRESENT'].apply(lambda x: x if x in valid_tax_classes else np.nan)
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 83778 entries, 0 to 84542
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BOROUGH                         83778 non-null  int64  
 1   NEIGHBORHOOD                    83778 non-null  object 
 2   BUILDING CLASS CATEGORY         83778 non-null  object 
 3   TAX CLASS AT PRESENT            75008 non-null  float64
 4   BLOCK                           83778 non-null  int16  
 5   LOT                             83778 non-null  int16  
 6   BUILDING CLASS AT PRESENT       83040 non-null  object 
 7   ADDRESS                         83778 non-null  object 
 8   ZIP CODE                        83778 non-null  int16  
 9   RESIDENTIAL UNITS               83778 non-null  int16  
 10  COMMERCIAL UNITS                83778 non-null  int16  
 11  TOTAL UNITS                     83778 non-null  int16  
 12  LAND SQUARE FEET                57724 non-null  float64
 13  GROSS SQUARE FEET               56393 non-null  float64
 14  YEAR BUILT                      83778 non-null  int64  
 15  TAX CLASS AT TIME OF SALE       83778 non-null  int64  
 16  BUILDING CLASS AT TIME OF SALE  83778 non-null  object 
 17  SALE PRICE                      69602 non-null  float64
 18  DayOfMonth                      83778 non-null  int32  
 19  Month                           83778 non-null  int32  
 20  Year                            83778 non-null  int32  
 21  DayOfWeek                       83778 non-null  object 
 22  Season                          83778 non-null  object 
dtypes: float64(4), int16(6), int32(3), int64(3), object(7)
memory usage: 11.5+ MB

Year Built:

Since the data was built in 2017 so we filter any value after that year and since before 1850 there was no NewYork so we filter any value before that year

In [ ]:
# Filter out the illogical values 
df['YEAR BUILT'] = df['YEAR BUILT'].apply(lambda x: x if x >= 1850 and x <= 2017 else np.nan)
print('Min Year: {0}   Max Year: {1}'.format(df['YEAR BUILT'].max(), df['YEAR BUILT'].min()))
Min Year: 2017.0   Max Year: 1850.0

Year:

The feature represent the year which the property was sold so we will check for every record record that contains the date of sale before the date of build and make the date of sale in the same year of the date build

In [ ]:
df.loc[df['Year'] < df['YEAR BUILT'], 'Year'] = df['YEAR BUILT']

Zip Code:

The data was for NewYork city, so we have to filter out all zip codes that don't belong to NewYork

In [ ]:
nyc_zip_range = (10001, 11697)
df = df[(df['ZIP CODE'] >= nyc_zip_range[0]) & (df['ZIP CODE'] <= nyc_zip_range[1])]

1.3 Handling missing values¶

We will use Median strategy to fill null values for the numeric features and Most Frequent strategy for categorical feature.¶
For numeric feature we will chose Min Max Scaler for scaling in order to avoid negative values in scaling process.¶
In [ ]:
# Define transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features_columns),
        ('cat', categorical_transformer, categorical_features_columns)
    ]
)

# Apply the preprocessor to the data
data_processed = preprocessor.fit_transform(df)

# Convert the processed data back to a DataFrame
numeric_cols = preprocessor.transformers_[0][2]
categorical_cols = preprocessor.transformers_[1][2]

df = pd.DataFrame(data_processed, columns=numeric_cols + list(categorical_cols))
df.head()
Out[ ]:
BLOCK LOT ZIP CODE RESIDENTIAL UNITS COMMERCIAL UNITS TOTAL UNITS LAND SQUARE FEET GROSS SQUARE FEET SALE PRICE BOROUGH ... BUILDING CLASS AT PRESENT ADDRESS BUILDING CLASS AT TIME OF SALE TAX CLASS AT TIME OF SALE DayOfMonth Month Year DayOfWeek Season YEAR BUILT
0 392.0 6.0 10009.0 5.0 0.0 5.0 1633.0 6440.0 6625000.0 1 ... C2 153 AVENUE B C2 2 19 7 2017 Wednesday Summer 1900.0
1 399.0 26.0 10009.0 28.0 3.0 31.0 4616.0 18690.0 532500.0 1 ... C7 234 EAST 4TH STREET C7 2 14 12 2016 Wednesday Winter 1900.0
2 399.0 39.0 10009.0 16.0 1.0 17.0 2212.0 7803.0 532500.0 1 ... C7 197 EAST 3RD STREET C7 2 9 12 2016 Friday Winter 1900.0
3 402.0 21.0 10009.0 10.0 0.0 10.0 2272.0 6794.0 3936272.0 1 ... C4 154 EAST 7TH STREET C4 2 23 9 2016 Friday Fall 1913.0
4 404.0 55.0 10009.0 6.0 0.0 6.0 2369.0 4615.0 8000000.0 1 ... C2 301 EAST 10TH STREET C2 2 17 11 2016 Thursday Fall 1900.0

5 rows × 23 columns

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82807 entries, 0 to 82806
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   BLOCK                           82807 non-null  object
 1   LOT                             82807 non-null  object
 2   ZIP CODE                        82807 non-null  object
 3   RESIDENTIAL UNITS               82807 non-null  object
 4   COMMERCIAL UNITS                82807 non-null  object
 5   TOTAL UNITS                     82807 non-null  object
 6   LAND SQUARE FEET                82807 non-null  object
 7   GROSS SQUARE FEET               82807 non-null  object
 8   SALE PRICE                      82807 non-null  object
 9   BOROUGH                         82807 non-null  object
 10  NEIGHBORHOOD                    82807 non-null  object
 11  BUILDING CLASS CATEGORY         82807 non-null  object
 12  TAX CLASS AT PRESENT            82807 non-null  object
 13  BUILDING CLASS AT PRESENT       82807 non-null  object
 14  ADDRESS                         82807 non-null  object
 15  BUILDING CLASS AT TIME OF SALE  82807 non-null  object
 16  TAX CLASS AT TIME OF SALE       82807 non-null  object
 17  DayOfMonth                      82807 non-null  object
 18  Month                           82807 non-null  object
 19  Year                            82807 non-null  object
 20  DayOfWeek                       82807 non-null  object
 21  Season                          82807 non-null  object
 22  YEAR BUILT                      82807 non-null  object
dtypes: object(23)
memory usage: 14.5+ MB

separate numeric feature from categorical feature

2. Exploratory Data Analysis (EDA)¶

In [ ]:
df[categorical_features_columns].nunique()
Out[ ]:
BOROUGH                               5
NEIGHBORHOOD                        253
BUILDING CLASS CATEGORY              47
TAX CLASS AT PRESENT                  4
BUILDING CLASS AT PRESENT           164
ADDRESS                           67158
BUILDING CLASS AT TIME OF SALE      164
TAX CLASS AT TIME OF SALE             4
DayOfMonth                           31
Month                                12
Year                                  2
DayOfWeek                             7
Season                                4
YEAR BUILT                          144
dtype: int64

2.1 Data Visualization¶

In [ ]:
def plot_histogram(df, columns, nbins):
    # Plot histograms for categorical columns
    for column in columns:
        fig = px.histogram(df, x=column, nbins=nbins, title=f'Histogram of {column}')
        fig.update_layout(**layout_options)
        fig.show()
plot_histogram(df, categorical_features_columns, 100)

We noticed there is imbalanced data for columns building class category, building class at present, building class at time of sale , Tax class at present, Tax class at time of sale

encoding : neighgourhood, address

grouping rares: building class category, building class at present, building class at time of sale

In [ ]:
def group_rare_categories(df, column, threshold=0.01, other_label="Other"):
    """
    Groups rare categories in a specified column of a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column (str): The column in which to group rare categories.
    threshold (float or int): The threshold for grouping rare categories. 
                              If float, it represents the fraction of total records (e.g., 0.01 for 1%).
                              If int, it represents the absolute number of occurrences.
    other_label (str): The label to use for grouped categories.

    Returns:
    pd.DataFrame: A DataFrame with the rare categories grouped into 'Other'.
    """

    # If threshold is a float, convert it to an absolute count threshold
    if isinstance(threshold, float):
        threshold = threshold * len(df)

    # Count the occurrences of each category in the column
    category_counts = df[column].value_counts()

    # Identify categories that should be grouped
    rare_categories = category_counts[category_counts < threshold].index

    # Replace rare categories with the 'other_label'
    df[column] = df[column].apply(lambda x: other_label if x in rare_categories else x)

    return df
In [ ]:
grouping_rare_columns = [
    'BUILDING CLASS AT TIME OF SALE',
    'BUILDING CLASS CATEGORY', 
    'BUILDING CLASS AT PRESENT',
]
group_rare_categories(df, 'BUILDING CLASS AT TIME OF SALE', threshold=0.01, other_label="Other")
group_rare_categories(df, 'BUILDING CLASS CATEGORY', threshold=0.025, other_label="Other")
group_rare_categories(df, 'BUILDING CLASS AT PRESENT', threshold=0.01, other_label="Other")
plot_histogram(df, grouping_rare_columns, 100)
In [ ]:
building_class_present_unique_values = df['BUILDING CLASS AT PRESENT'].unique().tolist()
building_class_category_unique_values = df['BUILDING CLASS CATEGORY'].unique().tolist()
building_class_time_unique_values = df['BUILDING CLASS AT TIME OF SALE'].unique().tolist()
print(building_class_present_unique_values, '\n', building_class_category_unique_values,  '\n', building_class_time_unique_values)
['Other', 'C6', 'D4', 'R2', 'R4', 'R1', 'R9', 'A9', 'B3', 'B1', 'C0', 'C3', 'RG', 'B9', 'A5', 'B2', 'R3', 'A1', 'A2', 'V0'] 
 ['07 RENTALS - WALKUP APARTMENTS             ', 'Other', '09 COOPS - WALKUP APARTMENTS               ', '10 COOPS - ELEVATOR APARTMENTS             ', '13 CONDOS - ELEVATOR APARTMENTS            ', '01 ONE FAMILY DWELLINGS                    ', '02 TWO FAMILY DWELLINGS                    ', '03 THREE FAMILY DWELLINGS                  '] 
 ['Other', 'C6', 'D4', 'R2', 'R4', 'R1', 'R9', 'A9', 'B3', 'B1', 'C0', 'C3', 'RG', 'B9', 'A5', 'B2', 'R3', 'A1', 'A2', 'V0']
In [ ]:
# Save these lists to files
joblib.dump(building_class_present_unique_values, 'building_class_present_unique_values.pkl')
joblib.dump(building_class_category_unique_values, 'building_class_category_unique_values.pkl')
joblib.dump(building_class_time_unique_values, 'building_class_time_unique_values.pkl')
Out[ ]:
['building_class_time_unique_values.pkl']

now the features BUILDING CLASS AT TIME OF SALE, BUILDING CLASS CATEGORY, BUILDING CLASS AT PRESENT have almost more balanced data than before

since TAX CLASS AT PRESENT and TAX CLASS AT TIME OF SALE has only 3 values at 3 category, so if the values of these columns are equelevent then we will delete the rows at that values in order to make the distribution of the data more balanced

In [ ]:
tax_class_3 = df[df['TAX CLASS AT PRESENT'] == 3]
print(tax_class_3['TAX CLASS AT PRESENT'].values)
print(tax_class_3['TAX CLASS AT TIME OF SALE'].values)
[3.0]
[3]
In [ ]:
df = df[df["TAX CLASS AT TIME OF SALE"] != 3]
plot_histogram(df, ['TAX CLASS AT TIME OF SALE', 'TAX CLASS AT PRESENT'], nbins=100)

For the value 4 that has low occurences compared to 1 and 2 (Skewed data) we could use Oversampling or Undersampling to handle that variance in the distribution but each one of them has disadvantages:

Undersampling Disadvantages:

-Risk of losing valuable information from the majority class, which could lead to underfitting.

-The model might not have enough data to learn general patterns from the majority class.

Oversampling Disadvantages:

-Random oversampling can lead to overfitting since the model might learn specific duplicated examples rather than general patterns.

-Synthetic oversampling might introduce noise if not carefully applied.

So we will use Randomforest with Cross-validation for model training and selection so that will helps avoid the impact of skewed data on the model.

2.2 Feature Engineering¶

Check for outliers¶

In [ ]:
# Create subplots with two columns
def make_box_plot(df, numeric_cols):
    fig = sp.make_subplots(rows=len(numeric_cols) // 2 + len(numeric_cols) % 2, cols=2, vertical_spacing=0.03, horizontal_spacing=0.3)


    # Add box plots to the subplots
    for i, column in enumerate(numeric_cols, start=1):
        fig.add_trace(go.Box(y=df[column], name=column), row=(i-1)//2 + 1, col=(i-1)%2 + 1)

    # Adjust the layout
    fig.update_layout( **layout_options, height=150*len(numeric_cols), width=950)

    # Change the color of subplot titles to white
    for annotation in fig['layout']['annotations']: 
        annotation['font'] = dict(color='white')
        
    # Change the color of axis titles and tick labels to white
    for axis in fig.layout:
        if type(fig.layout[axis]) == go.layout.XAxis:
            fig.layout[axis].title.font.color = 'white'
            fig.layout[axis].tickfont.color = 'white'
        elif type(fig.layout[axis]) == go.layout.YAxis:
            fig.layout[axis].title.font.color = 'white'
            fig.layout[axis].tickfont.color = 'white'
    pio.renderers.default = "notebook"  # Change to "iframe" or "svg" if issues persist
    # Show the plot
    fig.show()
    
    
make_box_plot(df, numeric_cols)

Handling outliers using IQR¶

In [ ]:
# Define the columns that you want to check for outliers
outlier_columns = [
    'BLOCK',
    'LOT',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET', 
    'GROSS SQUARE FEET',
    'SALE PRICE',
]

# Function to remove outliers iteratively
def remove_outliers(df, columns):
    for column in columns:
        # while True:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        # Define the acceptable range
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Identify the outliers
        # initial_size = df.shape[0]
        df[column] = df[column].where((df[column] >= lower_bound) & (df[column] <= upper_bound), df[column].median())

            # If no rows were removed, break the loop
            # if df.shape[0] == initial_size:
            #     break

    return df

# Apply the iterative outlier removal process
df = remove_outliers(df, outlier_columns)

# Check the cleaned dataset
make_box_plot(df, numeric_cols)
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 82806 entries, 0 to 82806
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   BLOCK                           82806 non-null  object
 1   LOT                             82806 non-null  object
 2   ZIP CODE                        82806 non-null  object
 3   RESIDENTIAL UNITS               82806 non-null  object
 4   COMMERCIAL UNITS                82806 non-null  object
 5   TOTAL UNITS                     82806 non-null  object
 6   LAND SQUARE FEET                82806 non-null  object
 7   GROSS SQUARE FEET               82806 non-null  object
 8   SALE PRICE                      82806 non-null  object
 9   BOROUGH                         82806 non-null  object
 10  NEIGHBORHOOD                    82806 non-null  object
 11  BUILDING CLASS CATEGORY         82806 non-null  object
 12  TAX CLASS AT PRESENT            82806 non-null  object
 13  BUILDING CLASS AT PRESENT       82806 non-null  object
 14  ADDRESS                         82806 non-null  object
 15  BUILDING CLASS AT TIME OF SALE  82806 non-null  object
 16  TAX CLASS AT TIME OF SALE       82806 non-null  object
 17  DayOfMonth                      82806 non-null  object
 18  Month                           82806 non-null  object
 19  Year                            82806 non-null  object
 20  DayOfWeek                       82806 non-null  object
 21  Season                          82806 non-null  object
 22  YEAR BUILT                      82806 non-null  object
dtypes: object(23)
memory usage: 15.2+ MB

it turns out that there is some column that has Extreme Outliers:

The filtering might have removed some outliers, but the remaining data still have a wide range or other extreme values that extend the whiskers of the box plot.

The outliers might be very close to the IQR boundaries, so the filtering doesn't remove them, or they might not be considered outliers by the IQR method.

also we tried another approche which is Z-score, and the result is the same so we will let scaling handle that¶

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 82806 entries, 0 to 82806
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   BLOCK                           82806 non-null  object
 1   LOT                             82806 non-null  object
 2   ZIP CODE                        82806 non-null  object
 3   RESIDENTIAL UNITS               82806 non-null  object
 4   COMMERCIAL UNITS                82806 non-null  object
 5   TOTAL UNITS                     82806 non-null  object
 6   LAND SQUARE FEET                82806 non-null  object
 7   GROSS SQUARE FEET               82806 non-null  object
 8   SALE PRICE                      82806 non-null  object
 9   BOROUGH                         82806 non-null  object
 10  NEIGHBORHOOD                    82806 non-null  object
 11  BUILDING CLASS CATEGORY         82806 non-null  object
 12  TAX CLASS AT PRESENT            82806 non-null  object
 13  BUILDING CLASS AT PRESENT       82806 non-null  object
 14  ADDRESS                         82806 non-null  object
 15  BUILDING CLASS AT TIME OF SALE  82806 non-null  object
 16  TAX CLASS AT TIME OF SALE       82806 non-null  object
 17  DayOfMonth                      82806 non-null  object
 18  Month                           82806 non-null  object
 19  Year                            82806 non-null  object
 20  DayOfWeek                       82806 non-null  object
 21  Season                          82806 non-null  object
 22  YEAR BUILT                      82806 non-null  object
dtypes: object(23)
memory usage: 15.2+ MB
For some features We will encode the categorica feature using Frequency Encoding because we have high cardinality categorical features (some features have large amount of categories and this will a large amount of columns if we use One Hot Encoder), also Label Encoder is bad since we have a large amount of categories, Target Encoding is powerful but risk overfitting.¶

for other features that does not have high cardinality we will use Label Encoder.¶

In [ ]:
# Initialize separate LabelEncoders for each column
label_encoder_dayofweek = LabelEncoder()
label_encoder_season = LabelEncoder()
label_encoder_bldg_class_at_sale = LabelEncoder()
label_encoder_bldg_class_cat = LabelEncoder()
label_encoder_bldg_class_present = LabelEncoder()

# Fit and transform each column with its own LabelEncoder
df['DayOfWeek'] = label_encoder_dayofweek.fit_transform(df['DayOfWeek'])
df['Season'] = label_encoder_season.fit_transform(df['Season'])
df['BUILDING CLASS AT TIME OF SALE'] = label_encoder_bldg_class_at_sale.fit_transform(df['BUILDING CLASS AT TIME OF SALE'])
df['BUILDING CLASS CATEGORY'] = label_encoder_bldg_class_cat.fit_transform(df['BUILDING CLASS CATEGORY'])
df['BUILDING CLASS AT PRESENT'] = label_encoder_bldg_class_present.fit_transform(df['BUILDING CLASS AT PRESENT'])
In [ ]:
# Save each LabelEncoder individually
joblib.dump(label_encoder_dayofweek, 'label_encoder_dayofweek.pkl')
joblib.dump(label_encoder_season, 'label_encoder_season.pkl')
joblib.dump(label_encoder_bldg_class_at_sale, 'label_encoder_bldg_class_at_sale.pkl')
joblib.dump(label_encoder_bldg_class_cat, 'label_encoder_bldg_class_cat.pkl')
joblib.dump(label_encoder_bldg_class_present, 'label_encoder_bldg_class_present.pkl')
Out[ ]:
['label_encoder_bldg_class_present.pkl']
In [ ]:
# Save frequency encodings
joblib.dump(df['ADDRESS'].value_counts() / len(df), 'address_freq_map.pkl')
joblib.dump(df['NEIGHBORHOOD'].value_counts() / len(df), 'neighborhood_freq_map.pkl')
Out[ ]:
['neighborhood_freq_map.pkl']
In [ ]:
def encoding_with_frequency(df, column):
    # Calculate the frequency of each category
    frequency_encoding = df[column].value_counts() / len(df)

    # Map the frequencies to the original column
    df[column] = df[column].map(frequency_encoding)
    
encoding_with_frequency(df, 'ADDRESS')
encoding_with_frequency(df, 'NEIGHBORHOOD')
In [ ]:
# Create a correlation matrix
corr_matrix = df.corr()

# Set the figure size
plt.figure(figsize=(10, 10), facecolor='k')

# Set the style of the plot
sns.set_theme(style="dark")

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 25, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
ax = sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,
                 square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Set the color of the labels to white and bring them to the front
ax.tick_params(colors='white', axis='y', labelrotation=0)
ax.tick_params(colors='white', axis='x', labelrotation=90)

# Change the color of the color bar ticks and labels
cbar = ax.collections[0].colorbar
cbar.ax.yaxis.set_tick_params(color='white')  # Change tick color
plt.setp(cbar.ax.get_yticklabels(), color='white')  # Change tick label color

# Set the title and its color
plt.title('Correlation Matrix', color='white')

plt.show()
No description has been provided for this image

Normal Positive Correlations:¶

TOTAL UNITS and RESIDENTIAL UNITS: There appears to be a strong positive correlation, indicating that as the number of residential units increases, the total units also increase, which makes sense because residential units contribute to the total unit count, But its not string relationship so no action needed.

Weak or No Correlations:¶

Many pairs of variables show little to no correlation, as indicated by values close to 0 in the matrix. This implies that those variables do not have a linear relationship.

High Negative Correlations:¶

Year and Month: The negative correlation could arise if these variables are being treated in a way that doesn't align with their typical meanings. For example, if Year is represented as a large number (e.g., 2023) and Month is represented as a smaller number (e.g., 1 for January, 12 for December), there might be an unintended mathematical relationship, but also no action needed>

In [ ]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit the data
scaler = scaler.fit(df)

# Transform the data
scaled_data = scaler.transform(df)

# Create a DataFrame from the scaled data
df = pd.DataFrame(scaled_data, columns=df.columns)
df.describe()
Out[ ]:
BLOCK LOT ZIP CODE RESIDENTIAL UNITS COMMERCIAL UNITS TOTAL UNITS LAND SQUARE FEET GROSS SQUARE FEET SALE PRICE BOROUGH ... BUILDING CLASS AT PRESENT ADDRESS BUILDING CLASS AT TIME OF SALE TAX CLASS AT TIME OF SALE DayOfMonth Month Year DayOfWeek Season YEAR BUILT
count 82806.000000 82806.000000 82806.000000 82806.000000 82806.0 82806.000000 82806.000000 82806.000000 82806.000000 82806.000000 ... 82806.000000 82806.000000 82806.000000 82806.000000 82806.000000 82806.000000 82806.00000 82806.000000 82806.000000 82806.000000
mean 0.294635 0.169374 0.506870 0.215709 0.0 0.364325 0.491816 0.507263 0.313295 0.499520 ... 0.475090 0.024631 0.475745 0.215910 0.509863 0.506399 0.64953 0.546832 0.493897 0.586542
std 0.237415 0.266313 0.332221 0.188529 0.0 0.272165 0.129825 0.138052 0.209921 0.321786 ... 0.283006 0.100278 0.283930 0.268929 0.293995 0.315787 0.47712 0.379027 0.374666 0.201469
min 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000
25% 0.096739 0.011641 0.180154 0.000000 0.0 0.333333 0.498828 0.487449 0.183206 0.250000 ... 0.210526 0.000000 0.210526 0.000000 0.266667 0.272727 0.00000 0.166667 0.000000 0.419162
50% 0.242212 0.027162 0.714117 0.200000 0.0 0.333333 0.498828 0.487449 0.325191 0.500000 ... 0.578947 0.000000 0.578947 0.333333 0.500000 0.454545 1.00000 0.666667 0.333333 0.538922
75% 0.437733 0.126940 0.800945 0.400000 0.0 0.666667 0.498828 0.487449 0.385536 0.750000 ... 0.631579 0.000000 0.631579 0.333333 0.766667 0.818182 1.00000 0.833333 1.000000 0.694611
max 1.000000 1.000000 1.000000 1.000000 0.0 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 1.000000

8 rows × 23 columns

In [ ]:
# Save the scaler
joblib.dump(scaler, 'scaler.pkl')
Out[ ]:
['scaler.pkl']

3. Modeling¶

3.1 Splitting data¶

In [ ]:
y = df['SALE PRICE']
X = df.drop('SALE PRICE', axis=1)

# Splitting the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

3.2 Model Evaluation and Training¶

In [ ]:
# Define a dictionary of models and their hyperparameters for grid search
model_params = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'Ridge Regression': {
        'model': Ridge(),
        'params': {
            'model__alpha': [0.01, 0.1, 1.0, 10.0],
            'model__max_iter': [1000, 5000, 10000]
        }
    },
    'Lasso Regression': {
        'model': Lasso(),
        'params': {
            'model__alpha': [0.01, 0.1, 1.0, 10.0],
            'model__max_iter': [1000, 5000, 10000]
        }
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'model__max_depth': [5, 10, 15, 20],
            'model__min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestRegressor(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [5, 10, 15],
            'model__min_samples_split': [2, 5, 10]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7]
        }
    },
    'SGD Regressor': {
        'model': SGDRegressor(),
        'params': {
            'model__alpha': [0.0001, 0.001, 0.01, 0.1],
            'model__max_iter': [1000, 5000, 10000],
            'model__penalty': ['l2', 'l1', 'elasticnet'],
            'model__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
        }
    }
}

# Initialize a dictionary to store the best models and results
best_models = {}
results = {}

# Perform RandomizedSearchCV for each model to reduce computation time
for name, model_info in model_params.items():
    start_time = time.time()
    # Create a pipeline with MinMaxScaler and the model
    pipeline = Pipeline([
        # ('scaler', MinMaxScaler()),
        ('model', model_info['model'])
    ])
    
    # Use RandomizedSearchCV for faster hyperparameter tuning
    randomized_search = RandomizedSearchCV(pipeline, model_info['params'], cv=5, 
                                           scoring='neg_mean_squared_error', n_jobs=-1, 
                                           n_iter=10,  # Set the number of iterations for random search
                                           random_state=42)
    randomized_search.fit(X_train, y_train)
    
    # Store the best model and its performance
    best_models[name] = randomized_search.best_estimator_
    results[name] = {
        'best_score': -randomized_search.best_score_,
        'best_params': randomized_search.best_params_
    }
    
    elapsed_time = time.time() - start_time
    print(f"{name}: Best MSE = {-randomized_search.best_score_}, Best Params = {randomized_search.best_params_} (Time: {elapsed_time:.2f}s)")

# Convert results to DataFrame for better readability
results_df = pd.DataFrame(results).T
print("\nSummary of Grid Search Results:")
print(results_df)

# Select the best model based on the mean cross-validation score (lower is better for MSE)
best_model_name = results_df['best_score'].idxmin()
best_model = best_models[best_model_name]

print(f"\nThe best model is {best_model_name} with a mean CV MSE of {results_df.loc[best_model_name, 'best_score']}.")

# Optionally, evaluate the best model on the test set 
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(f"Test Set MSE for the best model ({best_model_name}): {test_mse}")
c:\Users\omarq\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_search.py:320: UserWarning:

The total space of parameters 1 is smaller than n_iter=10. Running 1 iterations. For exhaustive searches, use GridSearchCV.

Linear Regression: Best MSE = 0.04062624191455223, Best Params = {} (Time: 8.74s)
Ridge Regression: Best MSE = 0.04062567175441606, Best Params = {'model__max_iter': 5000, 'model__alpha': 10.0} (Time: 3.38s)
Lasso Regression: Best MSE = 0.04399376543126933, Best Params = {'model__max_iter': 5000, 'model__alpha': 10.0} (Time: 1.83s)
Decision Tree: Best MSE = 0.03364297302618432, Best Params = {'model__min_samples_split': 10, 'model__max_depth': 10} (Time: 8.18s)
Random Forest: Best MSE = 0.031053158443675576, Best Params = {'model__n_estimators': 50, 'model__min_samples_split': 10, 'model__max_depth': 15} (Time: 305.41s)
Gradient Boosting: Best MSE = 0.03111158332197923, Best Params = {'model__n_estimators': 200, 'model__max_depth': 7, 'model__learning_rate': 0.1} (Time: 339.89s)
SGD Regressor: Best MSE = 0.04062786380200895, Best Params = {'model__penalty': 'l2', 'model__max_iter': 5000, 'model__learning_rate': 'adaptive', 'model__alpha': 0.001} (Time: 39.29s)

Summary of Grid Search Results:
                  best_score  \
Linear Regression   0.040626   
Ridge Regression    0.040626   
Lasso Regression    0.043994   
Decision Tree       0.033643   
Random Forest       0.031053   
Gradient Boosting   0.031112   
SGD Regressor       0.040628   

                                                         best_params  
Linear Regression                                                 {}  
Ridge Regression     {'model__max_iter': 5000, 'model__alpha': 10.0}  
Lasso Regression     {'model__max_iter': 5000, 'model__alpha': 10.0}  
Decision Tree      {'model__min_samples_split': 10, 'model__max_d...  
Random Forest      {'model__n_estimators': 50, 'model__min_sample...  
Gradient Boosting  {'model__n_estimators': 200, 'model__max_depth...  
SGD Regressor      {'model__penalty': 'l2', 'model__max_iter': 50...  

The best model is Random Forest with a mean CV MSE of 0.031053158443675576.
Test Set MSE for the best model (Random Forest): 0.03115630115433893
In [ ]:
joblib.dump(best_model, 'nyc_property_price_model.pkl')
Out[ ]:
['nyc_property_price_model.pkl']
In [ ]:
df.head()
Out[ ]:
BLOCK LOT ZIP CODE RESIDENTIAL UNITS COMMERCIAL UNITS TOTAL UNITS LAND SQUARE FEET GROSS SQUARE FEET SALE PRICE BOROUGH ... BUILDING CLASS AT PRESENT ADDRESS BUILDING CLASS AT TIME OF SALE TAX CLASS AT TIME OF SALE DayOfMonth Month Year DayOfWeek Season YEAR BUILT
0 0.028526 0.002772 0.004725 1.0 0.0 0.333333 0.231641 0.487449 0.325191 0.0 ... 0.631579 0.0 0.631579 0.333333 0.600000 0.545455 1.0 1.000000 0.666667 0.299401
1 0.029036 0.013858 0.004725 0.2 0.0 0.333333 0.498828 0.487449 0.325191 0.0 ... 0.631579 0.0 0.631579 0.333333 0.433333 1.000000 0.0 1.000000 1.000000 0.299401
2 0.029036 0.021064 0.004725 0.2 0.0 0.333333 0.457813 0.487449 0.325191 0.0 ... 0.631579 0.0 0.631579 0.333333 0.266667 1.000000 0.0 0.000000 1.000000 0.299401
3 0.029255 0.011086 0.004725 0.2 0.0 0.333333 0.481250 0.487449 0.325191 0.0 ... 0.631579 0.0 0.631579 0.333333 0.733333 0.727273 0.0 0.000000 0.000000 0.377246
4 0.029401 0.029933 0.004725 0.2 0.0 0.333333 0.519141 0.487449 0.325191 0.0 ... 0.631579 0.0 0.631579 0.333333 0.533333 0.909091 0.0 0.666667 0.000000 0.299401

5 rows × 23 columns

In [ ]:
def group_rare_categories(df, column, unique_values, other_label="Other"):
    """
    Groups rare categories in a specified column of a DataFrame according to pre-determined unique values.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column (str): The column in which to group rare categories.
    unique_values (list): The list of unique values to keep.
    other_label (str): The label to use for grouped categories.

    Returns:
    pd.DataFrame: A DataFrame with the rare categories grouped into 'Other'.
    """
    df[column] = df[column].apply(lambda x: x if x in unique_values else other_label)
    return df

# Function to preprocess the data
def preprocess_data(df):
    # Drop the 'Unnamed: 0' column
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

    # Convert 'SALE DATE' to datetime format
    df['SALE DATE'] = pd.to_datetime(df['SALE DATE'])

    # Extract numeric features
    df['DayOfMonth'] = df['SALE DATE'].dt.day
    df['Month'] = df['SALE DATE'].dt.month
    df['Year'] = df['SALE DATE'].dt.year

    # Extract categorical features
    df['DayOfWeek'] = df['SALE DATE'].dt.day_name()
    df['Season'] = df['SALE DATE'].dt.month.apply(lambda x: 'Spring' if 3 <= x <= 5
                                                  else ('Summer' if 6 <= x <= 8
                                                        else ('Fall' if 9 <= x <= 11
                                                              else 'Winter')))
    df.drop('SALE DATE', axis=1, inplace=True)

    # Drop other unnecessary columns
    df.drop(['APARTMENT NUMBER', 'EASE-MENT'], axis=1, inplace=True)

    #Load and apply grouping of rare categories
    building_class_present_unique_values = joblib.load('building_class_present_unique_values.pkl')
    building_class_category_unique_values = joblib.load('building_class_category_unique_values.pkl')
    building_class_time_unique_values = joblib.load('building_class_time_unique_values.pkl')

    df = group_rare_categories(df, 'BUILDING CLASS AT PRESENT', building_class_present_unique_values)
    df = group_rare_categories(df, 'BUILDING CLASS CATEGORY', building_class_category_unique_values)
    df = group_rare_categories(df, 'BUILDING CLASS AT TIME OF SALE', building_class_time_unique_values)

    # Load label encoders
    le_dayofweek = joblib.load('label_encoder_dayofweek.pkl')
    le_season = joblib.load('label_encoder_season.pkl')
    le_bldg_class_at_sale = joblib.load('label_encoder_bldg_class_at_sale.pkl')
    le_bldg_class_cat = joblib.load('label_encoder_bldg_class_cat.pkl')
    le_bldg_class_present = joblib.load('label_encoder_bldg_class_present.pkl')
    
    # print(df.head())
    
    #Apply the encodings using the loaded label encoders
    df['DayOfWeek'] = le_dayofweek.transform(df['DayOfWeek'])
    df['Season'] = le_season.transform(df['Season'])
    df['BUILDING CLASS AT TIME OF SALE'] = le_bldg_class_at_sale.transform(df['BUILDING CLASS AT TIME OF SALE'])
    df['BUILDING CLASS CATEGORY'] = le_bldg_class_cat.transform(df['BUILDING CLASS CATEGORY'])
    df['BUILDING CLASS AT PRESENT'] = le_bldg_class_present.transform(df['BUILDING CLASS AT PRESENT'])

    # Apply frequency encoding
    def encoding_with_frequency(df, column, frequency_map):
        df[column] = df[column].map(frequency_map)
        return df

    # Load frequency encoding mappings
    address_freq_map = joblib.load('address_freq_map.pkl')
    neighborhood_freq_map = joblib.load('neighborhood_freq_map.pkl')

    df = encoding_with_frequency(df, 'ADDRESS', address_freq_map)
    df = encoding_with_frequency(df, 'NEIGHBORHOOD', neighborhood_freq_map)
    df.head()
    
    # Reorder columns the match data order before enter previuos scaling 
    new_order = ['BLOCK',
    'LOT',
    'ZIP CODE',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
    'SALE PRICE',
    'BOROUGH',
    'NEIGHBORHOOD',
    'BUILDING CLASS CATEGORY',
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT PRESENT',
    'ADDRESS',
    'BUILDING CLASS AT TIME OF SALE',
    'TAX CLASS AT TIME OF SALE',
    'DayOfMonth',
    'Month',
    'Year',
    'DayOfWeek',
    'Season',
    'YEAR BUILT']
    
    df = df[new_order]
    
    # Load and apply the MinMaxScaler
    scaler = joblib.load('scaler.pkl')
    scaled_data = scaler.transform(df)

    # Convert scaled data back to DataFrame for easier handling
    df = pd.DataFrame(scaled_data, columns=df.columns)
    
    print(df['SALE PRICE'])
    return df

processed_data = preprocess_data(test_data_after_save_model.copy())
processed_data.drop('SALE PRICE', axis=1, inplace=True)

#Load the trained model
model = joblib.load('nyc_property_price_model.pkl')

# Make predictions on the preprocessed new data
predictions = model.predict(processed_data)
print(predictions)
0    0.274809
1    0.335878
2    0.280916
3    7.140969
4    0.042321
Name: SALE PRICE, dtype: float64
[0.35252914 0.33057194 0.29541667 0.4037396  0.31013076]